library(tidyverse)
library(tidytext)
library(stringi)
library(stringr)
library(readxl)
library(qdapRegex)
library(readr)
library(tidygraph)
library(graphlayouts)
library(igraph)
library(ggraph)
library(writexl)

############### UPLOADING AND DATA HANDLE

# Read the four input files used by the rest of the script.
# Note: the object `stop` shares its name with base::stop(); calls such as
# stop("...") still resolve to the function, but the name is easy to misread.

noticias_filtradas <- read_csv(file = "input/noticias_filtradas.csv")
stop <- read_csv(file = "input/stop.csv")
words_scraping <- read_excel(
  path = "input/words_scraping.xlsx",
  sheet = "Hoja1"
)
actores_sociales <- read_excel(path = "input/actores_sociales.xlsx")

## Strip Spanish diacritics from a character vector.
##
## Maps the accented vowels, "ü" and "ñ" (lower- and upper-case) to their plain
## ASCII letters. Every other character is left untouched and NA stays NA.
## Upper-case letters are handled as well, so the result does not depend on
## whether the caller lower-cases the text before or after calling this.
##
## x: a character vector (or an object coercible to character).
## Returns a character vector of the same length as x.

f_remove_accent <- function(x) {
  # Written with \u escapes so the mapping does not depend on the encoding the
  # script file is read with. In order: á é í ó ú ü ñ Á É Í Ó Ú Ü Ñ
  accented <- paste0(
    "\u00e1\u00e9\u00ed\u00f3\u00fa\u00fc\u00f1",
    "\u00c1\u00c9\u00cd\u00d3\u00da\u00dc\u00d1"
  )
  plain <- "aeiouunAEIOUUN"
  # chartr() translates character by character and is vectorised over x
  chartr(accented, plain, x)
}

# Build `cuerpo2`, a normalised copy of the article body (`cuerpo`) that the
# keyword matching below works on.
noticias_filtradas <- noticias_filtradas %>%
  mutate(
    cuerpo2 = cuerpo %>%
      # delete every user name (tokens starting with @); str_remove() would
      # only drop the first mention found in each article:
      str_remove_all("\\@[[:alnum:]]+") %>%
      # delete URLs:
      str_remove_all("http[\\w[:punct:]]+") %>%
      # all text to lowercase:
      str_to_lower() %>%
      # delete digits, periods, commas, underscores and stray "@" signs
      # (they are removed, not replaced by a space):
      str_remove_all("[\\d\\.,_\\@]+") %>%
      # map accented letters to plain ones before the non-ASCII sweep:
      f_remove_accent() %>%
      # remove whatever non-ASCII characters are left (e.g. emojis)
      rm_non_ascii()
  )



# Normalise the keyword columns the same way as the article text: lower-case
# first, then strip accents. Lower-casing first matters because an upper-case
# accented letter (e.g. "Á") would otherwise only be turned into "á" and the
# keyword could never match the accent-free tokens of `cuerpo2`.
words_scraping <- words_scraping %>%
  mutate(
    across(
      c(palabra1, palabra2),
      function(palabra) f_remove_accent(str_to_lower(palabra))
    )
  )

######### TOKENIZATION OF THE DATA ####################

# Education (whole corpus): split `cuerpo2` into bigrams, one row per pair of
# consecutive words, and store the two words in separate columns.
# Columns 2 and 5 are picked by position.
# NOTE(review): presumably the date column and the new `ngram` column - confirm.
lineas_filtrado2 <- noticias_filtradas %>%
  unnest_tokens(
    output = ngram,
    input = cuerpo2,
    token = "ngrams",
    n = 2
  ) %>%
  select(2, 5) %>%
  separate(col = ngram, into = c("word1", "word2"), sep = " ")

# Now filter the news into nested subsets, one per category

# Higher Education
# The search terms live in a vector and are joined with "|" into one regular
# expression. Writing the pattern as a string literal that continues on the
# next source line embeds the line break and the indentation in the pattern,
# so the first term of the continuation line ("instituto profesional") could
# never match.
terminos_educacion_superior <- c(
  "educacion", "educacional", "superior", "establecimientos educacionales",
  "educacion superior", "educacion universitaria", "instituto profesional",
  "educacion tecnica", "universidad", "universidades", "liceo", "colegio",
  "institucion educativa"
)

noticias_filtradas2 <- noticias_filtradas %>%
  filter(
    str_detect(cuerpo2, paste(terminos_educacion_superior, collapse = "|"))
  )

# Reforms of the free college tuition policy
# The terms are joined with "|" instead of being written as one string literal
# spanning two source lines: the embedded line break and indentation made the
# term "gratuidad universitaria" impossible to match.
# NOTE(review): "educacion gratuida" looks like a misspelling of
# "educacion gratuita"; left unchanged - confirm with the author.
terminos_gratuidad <- c(
  "reforma gratuidad", "reforma financiamiento", "gratuidad universal",
  "reforma educacion", "gratuidad universitaria", "educacion gratuida"
)

noticias_filtradas3 <- noticias_filtradas2 %>%
  filter(str_detect(cuerpo2, paste(terminos_gratuidad, collapse = "|")))

# Issues and actors
# The terms are joined with "|" instead of being written as one string literal
# spanning several source lines: every line break (plus indentation) ended up
# inside the pattern, so the first term of each continuation line
# ("daniel andrade", "universidades estatales", "universidades publicas",
# "gabriel boric", "nicolas eyzaguirre", "raul figueroa", "juan vargas")
# could never match.
# NOTE(review): "camila vallejos" and "claudia periano" may be misspelled
# names; left unchanged - confirm with the author.
terminos_actores <- c(
  "movimiento estudiantil", "dirigente estudiantil",
  "dirigentes estudiantiles", "daniel andrade", "marcha estudiantil",
  "protesta estudiantil", "movilizacion estudiantil",
  "universidades estatales", "formacion tecnica", "institutos profesionales",
  "rectores cruch", "universidades publicas", "instituciones privadas",
  "giorgio jackson", "camila vallejos", "gabriel boric", "camila rojas",
  "miguel crispi", "ministerio educacion", "ministro educacion",
  "nicolas eyzaguirre", "ministra educacion", "adriana delpiano",
  "gerardo varela", "marcela cubillos", "raul figueroa",
  "presidenta bachelet", "presidente pinera", "rodrigo valdes",
  "valentina quiroga", "juan vargas", "eduardo vargas", "juan eduardo",
  "roxana pey", "claudia periano"
)

noticias_filtradas4 <- noticias_filtradas3 %>%
  filter(str_detect(cuerpo2, paste(terminos_actores, collapse = "|")))


## Now we separate and tokenize the news of each nested subset

# Split `cuerpo2` into bigrams, keep columns 2 and 5 (picked by position) and
# store the two words of each bigram in `word1` / `word2`.
a_bigramas <- function(noticias) {
  bigramas <- unnest_tokens(noticias, ngram, cuerpo2, token = "ngrams", n = 2)
  bigramas <- select(bigramas, 2, 5)
  separate(bigramas, ngram, c("word1", "word2"), sep = " ")
}

# NOTE(review): lineas_filtrado3 is not read again in the code visible here
# (levels 1 and 2 are both matched against lineas_filtrado2) - confirm.
lineas_filtrado3 <- a_bigramas(noticias_filtradas2)

lineas_filtrado4 <- a_bigramas(noticias_filtradas3)

lineas_filtrado5 <- a_bigramas(noticias_filtradas4)

## First, we will look into all the bigrams.

# Keyword rows that define a full bigram, i.e. whose second word is present.
# Only `palabra2` is checked, so this is the exact complement of the
# single-word rows selected later with is.na(palabra2); na.omit() would also
# discard keyword rows that have an NA in any other column.
words_scraping_sin_na <- words_scraping %>%
  filter(!is.na(palabra2))

# The different levels. In the paper we define 4 levels: 1) Education,
# 2) Higher Education, 3) Free College Tuition Policy, 4) Issues and Actors
niveles <- unique(words_scraping$nivel)

# Print the number of levels to check that all of them are present
length(niveles)

############################## MATCHING OF BIGRAMS WITH NO NA'S WORDS IN THE COLUMNS #######################

############# For the levels 1 and 2

# For levels 1 and 2: keep the news bigrams (lineas_filtrado2) whose first word
# is one of the level's `palabra1` values and whose second word is one of the
# level's `palabra2` values, tag them with the level, and stack both results.
# NOTE(review): the two words are tested independently, so any combination of
# a listed first word with a listed second word is kept, not only the pairs
# that appear together in one keyword row - confirm this is intended.
df_filtrados_bigrams <- lapply(1:2, function(nivel_actual) {
  claves_nivel <- filter(words_scraping_sin_na, nivel == nivel_actual)

  lineas_filtrado2 %>%
    filter(
      word1 %in% claves_nivel$palabra1,
      word2 %in% claves_nivel$palabra2
    ) %>%
    mutate(nivel = nivel_actual)
}) %>%
  bind_rows()

########### For the level 3

# Keyword bigrams of level 3
words_scraping_filter <- filter(words_scraping_sin_na, nivel == 3)

# Level 3 is searched in the bigrams of the free-tuition subset
# (lineas_filtrado4); matching rows are tagged with nivel = 3
lineas_filtrado_bigrams2 <- lineas_filtrado4 %>%
  filter(
    word1 %in% words_scraping_filter$palabra1,
    word2 %in% words_scraping_filter$palabra2
  ) %>%
  mutate(nivel = 3)

df_filtrados_bigrams3 <- lineas_filtrado_bigrams2

# Append level 3 to the data frame that already holds levels 1 and 2
df_filtrados_bigrams <- rbind(df_filtrados_bigrams, df_filtrados_bigrams3)

####################### For the level 4

# Keyword bigrams of level 4
words_scraping_filter <- filter(words_scraping_sin_na, nivel == 4)

# Level 4 is searched in the bigrams of the issues-and-actors subset
# (lineas_filtrado5); matching rows are tagged with nivel = 4.
# The intermediate result keeps its own name because later code may read it.
lineas_filtrado_bigrams3 <- lineas_filtrado5 %>%
  filter(
    word1 %in% words_scraping_filter$palabra1,
    word2 %in% words_scraping_filter$palabra2
  ) %>%
  mutate(nivel = 4)

df_filtrados_bigrams4 <- lineas_filtrado_bigrams3

# Append level 4 to the data frame that already holds levels 1 to 3
df_filtrados_bigrams <- rbind(df_filtrados_bigrams, df_filtrados_bigrams4)

# Print the levels found, to check that all 4 are present
unique(df_filtrados_bigrams$nivel)


################################# MATCHING OF BIGRAMS WITH NA'S IN THE SECOND WORD ###################

# Keyword rows that are single words, i.e. rows without a second word
words_scraping_con_na <- filter(words_scraping, is.na(palabra2))

# Levels that have single-word keywords. On this occasion there are no single
# words on level 3
niveles2 <- words_scraping_con_na %>%
  pull(nivel) %>%
  unique()

############## FOR LEVEL 1 AND 2

# For levels 1 and 2: keep the news bigrams (lineas_filtrado2) whose first word
# is a single-word keyword of the level and whose second word is NOT a second
# word of any keyword bigram (all levels), tag them with the level, and stack
# both results.
df_filtrados_bigrams5 <- lapply(1:2, function(nivel_actual) {
  claves_sueltas <- filter(words_scraping_con_na, nivel == nivel_actual)

  coincidencias <- filter(lineas_filtrado2, word1 %in% claves_sueltas$palabra1)
  coincidencias <- filter(
    coincidencias,
    !word2 %in% words_scraping_sin_na$palabra2
  )
  mutate(coincidencias, nivel = nivel_actual)
}) %>%
  bind_rows()

############# Level 4

# Single-word keywords of level 4
words_scraping_filter_na <- words_scraping_con_na %>%
  filter(nivel == 4)

# Search all bigrams of the issues-and-actors subset (lineas_filtrado5), the
# same way levels 1 and 2 search lineas_filtrado2. The previous version started
# from lineas_filtrado_bigrams3, i.e. rows already selected because their
# second word IS a keyword second word, so the `!word2 %in% ...` condition
# removed every row and this step always returned an empty data frame.
lineas_filtrado_na2 <- lineas_filtrado5 %>%
  filter(
    word1 %in% words_scraping_filter_na$palabra1,
    !word2 %in% words_scraping_sin_na$palabra2
  ) %>%
  mutate(nivel = 4)

df_filtrados_bigrams6 <- lineas_filtrado_na2

# Stack the single-word matches of levels 1-2 and of level 4
df_filtrados_bigram_final <- rbind(
  df_filtrados_bigrams5,
  df_filtrados_bigrams6
)

# Stack the bigram matches and the single-word matches into one data frame
df_principal <- rbind(df_filtrados_bigrams, df_filtrados_bigram_final)

####### Finally, we add one last category of "Student Movement"

# Attach to every match the distinct values of columns 3 and 4 of the keyword
# table (picked by position), joined on `nivel`.
# NOTE(review): presumably these are `nivel` and its label `n_nivel` - confirm.
etiquetas_nivel <- distinct(select(words_scraping, 3, 4))
df <- left_join(df_principal, etiquetas_nivel, by = "nivel")

# Matches of levels 1 to 3 and of level 4, kept separately
resto_n <- df %>% filter(nivel %in% 1:3)
n_4 <- df %>% filter(nivel == 4)

# Matches whose two words both appear in the social-actor lists, relabelled as
# "Student movement". The second condition used to be `word2 %in% word2`,
# which compares the column with itself and is therefore always TRUE.
n_estudiantil <- df %>%
  filter(
    word1 %in% actores_sociales$word1,
    word2 %in% actores_sociales$word2
  ) %>%
  mutate(n_nivel = "Student movement")
# Level-4 rows of that subset get the new level code 5
n_estudiantil$nivel[n_estudiantil$nivel == 4] <- 5

# Level-4 rows whose first word does not occur in n_estudiantil
nn4 <- anti_join(n_4, n_estudiantil, by = "word1")
# Level-4 rows whose first word is outside nn4 but whose second word is in it
nn5 <- n_4 %>% filter(!word1 %in% nn4$word1, word2 %in% nn4$word2)
# Same rule applied once more, relative to nn5
nn6 <- n_4 %>% filter(!word1 %in% nn5$word1, word2 %in% nn5$word2)
# NOTE(review): nn4, nn5, nn6, base_4 and base_5 are not read again in the
# code visible here - confirm whether they are still needed.
base_4 <- rbind(nn4, nn5)
base_5 <- rbind(base_4, nn6)

# Spanish stopword list from quanteda, fetched once
stopwords_es <- quanteda::stopwords(language = "spa")

# Drop every bigram in which either word is a stopword (project list or
# quanteda list). The last condition used to read
# `!word2 %in% !word1 %in% stopwords`: because `!` binds more loosely than
# %in%, word2 was compared against a logical vector and was never actually
# checked against the stopword list.
df2 <- df %>%
  filter(
    !word1 %in% stop$palabra,
    !word2 %in% stop$palabra,
    !word1 %in% stopwords_es,
    !word2 %in% stopwords_es
  )

# Keep 2015-2021 only; the year is compared as a number (comparing it with the
# strings '2015' / '2021' silently switched to alphabetical comparison)
df2 <- df2 %>%
  mutate(year = lubridate::year(date)) %>%
  filter(year >= 2015, year <= 2021)

resto_niveles <- df2 %>% filter(nivel %in% 1:3)
# Print the levels present, as a check
unique(resto_niveles$nivel)

nivel4 <- df2 %>% filter(nivel == 4)

# Level-4 bigrams in which neither word appears in the social-actor lists are
# relabelled "Student movement" and given level code 5.
# NOTE(review): n_estudiantil above applies the opposite rule (words that ARE
# in the lists) for the same label - confirm which one is intended.
nivel4_singrup <- nivel4 %>%
  filter(
    !word1 %in% actores_sociales$word1,
    !word2 %in% actores_sociales$word2
  ) %>%
  mutate(n_nivel = "Student movement")
nivel4_singrup$nivel[nivel4_singrup$nivel == 4] <- 5

# Keep the first five columns so the columns line up with resto_n and n_4
# (this drops the `year` helper column added above)
nivel4_singrup <- nivel4_singrup %>% select(1:5)
base_prueba <- rbind(resto_n, n_4, nivel4_singrup)

# Drop every bigram in which either word is a stopword (project list or
# quanteda's Spanish list). The last condition used to read
# `!word2 %in% !word1 %in% stopwords`: because `!` binds more loosely than
# %in%, word2 was compared against a logical vector and was never actually
# checked against the stopword list.
df1 <- base_prueba %>%
  filter(
    !word1 %in% stop$palabra,
    !word2 %in% stop$palabra,
    !word1 %in% quanteda::stopwords(language = "spa"),
    !word2 %in% quanteda::stopwords(language = "spa")
  )

## Now we change the names of the categories for plotting. TIMELINE PLOT

# Spanish category label -> English label shown in the figures
traducciones <- c(
  "Educación" = "Education",
  "Educación Superior" = "Higher Education",
  "Política de gratuidad" = "Free College Tuition Policy",
  "Actores y asuntos de la gratuidad" = "Issues and Actors in Free College Tuition"
)

# Replace each Spanish label in turn; labels not listed are left as they are
for (etiqueta_es in names(traducciones)) {
  df1$n_nivel[df1$n_nivel == etiqueta_es] <- traducciones[[etiqueta_es]]
}


# Timeline plot: number of matched bigrams per year, one line per news topic.
plt <- df1 %>%
  # year of each match
  mutate(year = lubridate::year(date)) %>%
  count(year, n_nivel) %>%
  # date window of the plot; the year is compared as a number (comparing it
  # with the strings '2015' / '2021' silently switched to alphabetical order)
  filter(year >= 2015, year <= 2021) %>%
  # fix the legend order of the topics
  mutate(
    n_nivel = factor(
      n_nivel,
      levels = c(
        "Education", "Higher Education", "Free College Tuition Policy",
        "Issues and Actors in Free College Tuition", "Student movement"
      )
    )
  ) %>%
  ggplot(aes(year, n, color = n_nivel)) +
  # NOTE: ggplot2 >= 3.4.0 prefers `linewidth = 1` for lines; `size` is kept so
  # the script also runs on older versions
  geom_line(aes(linetype = n_nivel), size = 1) +
  scale_color_grey() +
  # colour alternative that was tried:
  # scale_color_manual(values = c("darkred", "orange", "steelblue", "black"))
  scale_linetype_manual(values = c("solid", "11", "dashed", "F1", "dotted")) +
  xlab("Year") +
  ylab("News associated to Bigrams") +
  labs(linetype = "News Topics") +
  theme_bw() +
  # hide the colour legend; "none" replaces the deprecated `FALSE`
  guides(color = "none")

plt


############## FREQUENCY OF NEWS PLOT

# For each nested subset of news keep column 2 (picked by position; it is used
# as `date` below) and add the English topic label of the subset.
educacion_g1 <- mutate(select(noticias_filtradas, 2), n_nivel = "Education")
educacion_sup <- mutate(
  select(noticias_filtradas2, 2),
  n_nivel = "Higher Education"
)
gratuidad <- mutate(
  select(noticias_filtradas3, 2),
  n_nivel = "Free College Tuition Policy"
)
actores <- mutate(
  select(noticias_filtradas4, 2),
  n_nivel = "Issues and Actors in Free College Tuition"
)

# One row per news item and topic it belongs to
freq_noticias <- rbind(educacion_g1, educacion_sup, gratuidad, actores)

# BARCHART PLOT

# Keep 2015-2021 only; the year is compared as a number (comparing it with the
# strings '2015' / '2021' silently switched to alphabetical comparison)
freq_noticias <- freq_noticias %>%
  mutate(year = lubridate::year(date)) %>%
  filter(year >= 2015, year <= 2021)

# Number of news items per topic and year; ungroup() so that no grouping is
# carried into the steps below
tabla1 <- freq_noticias %>%
  group_by(n_nivel, year) %>%
  tally() %>%
  ungroup()

# One row per topic (in fixed order), one column per year
tabla1 <- tabla1 %>%
  mutate(
    n_nivel = factor(
      n_nivel,
      levels = c(
        "Education", "Higher Education", "Free College Tuition Policy",
        "Issues and Actors in Free College Tuition"
      )
    )
  ) %>%
  pivot_wider(names_from = year, values_from = n)

# Topic/year combinations without news count as 0
tabla1[is.na(tabla1)] <- 0

# Row total over all year columns (everything except the topic column)
tabla1$total <- rowSums(tabla1[, -1])


# Bar chart: number of news items per year, one dodged bar per topic, with the
# count printed on top of each bar.
p3 <- freq_noticias %>%
  # count() on the ungrouped data gives the same counts as group_by() + count()
  # without leaving the result grouped
  count(year, n_nivel) %>%
  # fix the legend / bar order of the topics
  mutate(
    n_nivel = factor(
      n_nivel,
      levels = c(
        "Education", "Higher Education", "Free College Tuition Policy",
        "Issues and Actors in Free College Tuition"
      )
    )
  ) %>%
  ggplot(aes(year, n, fill = n_nivel)) +
  geom_bar(position = position_dodge(width = 0.9), stat = "identity") +
  scale_fill_grey() +
  scale_x_continuous(breaks = 2015:2021) +
  xlab("Year") +
  ylab("Frequency of News") +
  labs(fill = "News Topics") +
  theme_light() +
  # the labels use the same dodge width as the bars (0.9); with width = 1 they
  # were shifted away from the centre of their bars
  geom_text(
    aes(label = n),
    position = position_dodge(width = 0.9),
    vjust = 0,
    size = 2
  )


p3

